# nucs <- read.csv("EH23a.softmasked_nuccomp.csv")
# nucs$GC <- rowSums( nucs[, c("C", "c", "G", "g")] )/rowSums( nucs[, c("A", "a", "C", "c", "G", "g", "T", "t")] )
# nucs$GC <- nucs$GC * 100
gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#
gff[1:3, 1:8]
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 EH23a.chr1 annotation remark 1 65878799 . . .
## 2 EH23a.chr1 feature gene 11068 24494 . + .
## 3 EH23a.chr1 feature mRNA 11068 24494 . + .
genes <- gff[ gff[, 3] == "gene", ]
# genes[1:3, 1:8]
gff <- read.table("../Figure1/EH23a.unmasked.fasta.mod.EDTA.intact.gff3", sep = "\t", quote = "\"")
names(gff) <- c("seqid","source","type","start","end","score","strand","phase","attributes")
#gff$chrn <- sub("^.+chr", "", gff$V1)
gff$chrn <- sub("^.+chr", "", gff$seqid)
gff$chrn[ gff$chrn == "X" ] <- 10
gff$chrn <- as.numeric(gff$chrn)
gff$classification <- unlist(lapply(strsplit(gff[,9], split = ";"), function(x){ grep("Classification=", x, value = TRUE) }))
gff$classification <- sub("^Classification=", "", gff$classification)
gff$classification <- sub("^LTR/Copia", "Ty1", gff$classification)
gff$classification <- sub("^LTR/Gypsy", "Ty3", gff$classification)
#
gff[1:8, c(1:8, 10)]
## seqid source type start end score strand phase
## 1 EH23a.chr1 EDTA repeat_region 48463 58586 . - .
## 2 EH23a.chr1 EDTA target_site_duplication 48463 48467 . - .
## 3 EH23a.chr1 EDTA long_terminal_repeat 48468 50359 . - .
## 4 EH23a.chr1 EDTA Copia_LTR_retrotransposon 48468 58581 . - .
## 5 EH23a.chr1 EDTA long_terminal_repeat 56690 58581 . - .
## 6 EH23a.chr1 EDTA target_site_duplication 58582 58586 . - .
## 7 EH23a.chr1 EDTA repeat_region 67125 71966 . + .
## 8 EH23a.chr1 EDTA target_site_duplication 67125 67129 . + .
## chrn
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
# Ty3
Ty3 <- gff[ gff$classification == "Ty3", ]
Ty3 <- Ty3[ Ty3$type == "Gypsy_LTR_retrotransposon", ]
#Ty3 <- Ty3[ Ty3$V3 == "Gypsy_LTR_retrotransposon", ]
Ty3[1:8, c(1:8, 10:11)]
## seqid source type start end score strand
## 81 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2110985 2122540 . ?
## 87 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2180122 2191426 . +
## 161 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3614107 3626646 . -
## 206 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3905221 3916714 . ?
## 228 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4168059 4180534 . -
## 242 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4447228 4458755 . -
## 283 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4885633 4891899 . -
## 305 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 5232888 5239051 . +
## phase chrn classification
## 81 . 1 Ty3
## 87 . 1 Ty3
## 161 . 1 Ty3
## 206 . 1 Ty3
## 228 . 1 Ty3
## 242 . 1 Ty3
## 283 . 1 Ty3
## 305 . 1 Ty3
#table(gff$classification, gff$V1)
#tetbl <- table(gff$V1, gff$classification)
tetbl <- table(gff$seqid, gff$classification)
tetbl <- rbind(tetbl, colMeans(tetbl))
#tetbl
Ty3$motif <- sub("^motif=", "", unlist(lapply(strsplit(Ty3[, 9], split = ";"), function(x){ grep("motif", x, value = TRUE) })))
my_tbl <- table(Ty3$motif, Ty3$chrn)
#my_tbl[, 1:10]
my_mat <- as.matrix(my_tbl)
#my_mat[, 1:10]
my_mat2 <- my_mat[apply(my_mat, MARGIN = 1, function(x){ sum( x != 0 ) }) == 1, ]
my_mat2 <- my_mat2[sort.int(as.numeric(apply(my_mat2, MARGIN = 1, function(x){ paste(x, collapse = "") })),
index.return = T)$ix, ]
my_mat2
##
## 1 2 3 4 5 6 7 8 9 10
## TTTA 0 0 0 0 0 0 0 0 1 0
## TGAG 0 0 0 0 0 0 0 1 0 0
## TATC 0 0 0 0 0 0 1 0 0 0
## TATT 0 0 0 0 0 0 1 0 0 0
## TTCC 0 0 0 0 0 0 1 0 0 0
## TGTG 0 0 0 0 0 1 0 0 0 0
## TCAC 0 0 0 0 1 0 0 0 0 0
## TAGC 0 0 0 0 2 0 0 0 0 0
## TACC 0 0 0 1 0 0 0 0 0 0
## TTAG 0 0 0 1 0 0 0 0 0 0
## TTGC 0 0 0 1 0 0 0 0 0 0
## TAAT 0 0 1 0 0 0 0 0 0 0
## TCCA 0 0 1 0 0 0 0 0 0 0
## TGCT 0 0 1 0 0 0 0 0 0 0
## TAAC 0 1 0 0 0 0 0 0 0 0
## TCGT 0 1 0 0 0 0 0 0 0 0
## TCTG 0 1 0 0 0 0 0 0 0 0
## TACT 1 0 0 0 0 0 0 0 0 0
## TGTC 1 0 0 0 0 0 0 0 0 0
## TTCA 1 0 0 0 0 0 0 0 0 0
# my_cmd <- c("~/gits/hempy/bin/fast_win.py --win_size 1000000 /media/knausb/E737-9B48/releases/EH23a/EH23a.softmasked.fasta")
# my_cmd
# system(my_cmd)
# x <- read.table("EH23a.softmasked_wins.csv", header = T, sep = ",")
# #x[1:80, ]
#wins <- read.csv("EH23a.softmasked_wins1e5.csv")
wins <- read.csv("EH23a.softmasked_wins1e6.csv")
#wins <- read.csv("EH23a.softmasked_wins1e7.csv")
# wins <- read.csv("EH23a.softmasked_wins.csv")
wins$CGs <- 0
for( i in unique(wins$Id) ){
wins$CGs[ wins$Id == i] <- wins$CG[ wins$Id == i] - min(wins$CG[ wins$Id == i], na.rm = TRUE)
wins$CGs[ wins$Id == i] <- wins$CGs[ wins$Id == i]/max(wins$CGs[ wins$Id == i], na.rm = TRUE)
}
wins$chrn <- sub(".+chr", "", wins$Id)
wins$chrn[ wins$chrn == "X" ] <- 10
wins$chrn <- as.numeric(wins$chrn)
wins$ATs <- 1 - wins$CGs
#wins[1:3, ]
#
cwins <- wins[wins$CGs == 1 & !is.na(wins$CGs) , ]
# cwins <- cwins[ !duplicated( cwins$Id ), ]
# cwins$cent <- cwins$Start + cwins$Win_length/2
Centromere genes
gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#gff[gff$V3 == "remark", ]
#genes[ genes$V1 == cwins$Id[i] & genes$V4 >= cwins$Start[i] & genes$V5 <= cwins$End[i], ]
i <- 1
gff2 <- gff[ gff$V1 == cwins$Id[i] & gff$V4 >= cwins$Start[i] & gff$V5 <= cwins$End[i], ]
for( i in 2:nrow(cwins) ){
# gff2 <- gff[ gff$V1 == cwins$Id[i] & gff$V4 >= cwins$Start[i] & gff$V5 <= cwins$End[i], ]
gff2 <- rbind(gff2, gff[ gff$V1 == cwins$Id[i] & gff$V4 >= cwins$Start[i] & gff$V5 <= cwins$End[i], ])
}
#cbind(1:nrow(gff2), gff2[, 1:8])
#table(gff2$V3)
#gff2[gff2$V3 == "CDS", ]
#gff2[gff2$V3 == "gene", ]
gff2 <- gff2[gff2$V3 == "mRNA", ]
#unlist(lapply(strsplit(gff2$V9, split = ";"), function(x){ grep("desc", x, value = TRUE) }))
# lndmrk <- read.csv("csat_landmark_EH23a_blastn.csv", header = FALSE)
# names(lndmrk) <- c("qseqid","qlen","sseqid","slen","qstart","qend","sstart","send","evalue","bitscore","score","length","pident","nident","mismatch","positive","gapopen","gaps","ppos","sstrand")
# lndmrk$chrn <- sub("^.+chr", "", lndmrk$sseqid)
# lndmrk$chrn[ lndmrk$chrn == "X" ] <- 10
# lndmrk$chrn[ lndmrk$chrn == "Y" ] <- 11
# lndmrk$chrn <- as.numeric(lndmrk$chrn)
#
# lndmrk[1:3, ]
vars <- read.table("ERBxHO40vars.csv.gz", header = TRUE, sep = ",")
nrow(vars)
## [1] 450759
vars <- vars[vars$Missing == 0, ]
nrow(vars)
## [1] 35186
vars[1:3, ]
## CHROM POS n Allele_counts He Ne REFhom HET ALThom
## 48 chr_1e 62350 270 509,31 0.1082236 1.121357 240 29 1
## 102 chr_1e 98473 270 412,128 0.3617010 1.566664 145 122 3
## 220 chr_1e 204226 270 263,277 0.4996639 1.998657 65 133 72
## Missing
## 48 0
## 102 0
## 220 0
vars$ERBallele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[1]})))
vars$HO40allele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[2]})))
vars$ERBallele <- vars$ERBallele / (vars$n * 2)
vars$HO40allele <- vars$HO40allele / (vars$n * 2)
my_chrom <- "chr_1e"
plot( vars$POS[ vars$CHROM == my_chrom ], vars$ERBallele[ vars$CHROM == my_chrom ],
pch = 20, col = "#C0C0C066", ylim = c(0, 1))
points( vars$POS[ vars$CHROM == my_chrom ], vars$HO40allele[ vars$CHROM == my_chrom ],
pch = 20, col = "#4682B4")
vars$chrn <- sub("chr_", "", vars$CHROM)
vars$chrn <- sub("e$", "", vars$chrn)
vars$chrn[ vars$chrn == "X" ] <- 10
vars$chrn <- as.numeric(vars$chrn)
table(vars$chrn)
##
## 1 2 3 4 5 6 7 8 9 10
## 2933 3228 4869 5716 3184 2886 2317 1975 4690 3388
library(ggplot2)
ggplot(vars, aes(x=chrn, y=POS, group=CHROM)) +
geom_point()
my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Set3"), "08", sep="")
#my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Paired"), "08", sep="")
hist(vars$ERBallele[vars$chrn == 2], xlim = c(0, 1))
hist(vars$HO40allele[vars$chrn == 2], xlim = c(0, 1))
p <- ggplot(vars, aes(x=chrn+ERBallele-0.5, y=POS, group=CHROM)) +
geom_point( aes( color = CHROM ) )
p <- p + scale_color_manual(values=my_cols1)
#p <- p + geom_point( data = vars, aes(x = chrn-HO40allele, y=POS), color = vars$chrn)
#
p <- p + geom_point( data = vars, aes(x = chrn+HO40allele-0.5, y=POS), color = my_cols1[vars$chrn])
#p <- p + geom_point( data = vars, aes(x = chrn-0, y=POS), color = my_cols1[vars$chrn])
#p <- p + scale_color_manual(values=my_cols1)
p <- p + theme_bw()
p <- p + theme(legend.position='none')
p <- p + scale_x_continuous(breaks=seq(1, 10, 1))
p <- p + xlab("Chromosome")
p
#p + scale_color_brewer(palette="Dark2")
#p + scale_color_brewer(palette="Set3")
# geom_point( aes( color = CHROM, alpha = 1/10 ) )
# geom_point( aes( alpha = 1/10 ) )
# wins[1:3, ]
# genes[1:3, 1:8]
wins$gcnt <- 0
wins$ty1cnt <- 0
wins$ty3cnt <- 0
for(i in 1:nrow(wins)){
tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
wins$gcnt[i] <- nrow(tmp)
tmp <- gff[gff$V1 == wins$Id[i] & gff$V4 >= wins$Start[i] & gff$V5 < wins$End[i], ]
wins$ty1cnt[i] <- sum(tmp$classification == "Ty1")
wins$ty3cnt[i] <- sum(tmp$classification == "Ty3")
}
wins$gcntsc <- wins$gcnt - min(wins$gcnt)
wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
#range(round( wins$gcntsc * 100 ))
my_index <- round( wins$gcntsc * 100 )
#my_index <- 100 - my_index
my_index[ my_index <= 0] <- 1
#
# #wins$gcol <- heat.colors(n=100)[ my_index ]
# #wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
# #wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
# #wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
# #wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]
#
#
# #wins$gcol <- viridisLite::plasma(n = 100, alpha = 1, begin = 0.1, end = 1)[ my_index ]
# wins$gcol <- viridisLite::magma(n = 100, alpha = 1, begin = 0.2, end = 1.00)[ my_index ]
#
# wins$chr <- sub("^.+chr", "chr", wins$Id)
# #
# wins[1:3, ]
# Cannabinoid BLAST results.
# cann_blst <- read.csv("cann_3dom_EH23a_tblastn.csv", header = FALSE)
# colnames(cann_blst) <- c('qseqid','qlen','sseqid','slen','qstart',
# 'qend','sstart','send','evalue','bitscore',
# 'score','length','pident','nident','mismatch',
# 'positive','gapopen','gaps','ppos','sstrand',
# 'sseq')
# #
# cann_blst <- cann_blst[grep("SignalP|FAD|BBE", cann_blst$qseqid, invert = TRUE, value = F), ]
# cann_blst$name <- sub("^.+_", "", cann_blst$qseqid)
#
# # table(cann_blst$qlen)
# cann_blst <- cann_blst[ cann_blst$gaps <= 0, ]
# cann_blst <- cann_blst[ cann_blst$mismatch <= 10, ]
#
# # write.table(cann_blst, file = "cann_3dom_EH23a_tblastn_filter.csv",
# # sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
# # system("~/gits/hempy/bin/blast_to_gff.py cann_3dom_EH23a_tblastn_filter.csv")
#
# cann_blst$chrn <- as.numeric(sub(".+chr", "", cann_blst$sseqid))
#
# #table(cann_blst$qseqid)
#
# # cann_blst[1:3, 1:10]
# # cann_blst[1:3, 10:20]
#
# knitr::kable(cann_blst[, c(1, 2, 3, 7, 13:15, 18, 23)], caption = "**Table 1. Cannabinoid synthase genes.**")
#
#nrow(cann_blst)
# blst <- read.csv("EH23a_blastn.csv", header = FALSE)
# colnames(blst) <- c('qseqid','qlen','sseqid','slen','qstart',
# 'qend','sstart','send','evalue','bitscore',
# 'score','length','pident','nident','mismatch',
# 'positive','gapopen','gaps','ppos','sstrand',
# #'staxids','sblastnames','salltitles',
# 'sseq')#[1:21]
#
# #blst$sseqid <- factor(blst$sseqid, levels = c("EH23a.chr1", "EH23a.chr2", "EH23a.chr3", "EH23a.chr4", "EH23a.chr5", "EH23a.chr6", "EH23a.chr7", "EH23a.chr8", "EH23a.chr9", "EH23a.chrX"))
# #blst <- blst[blst$qseqid == "CsatSD_centromere_370bp", ]
# #table(blst$qseqid)
# # blst[1:3, 1:10]
#
# blst$chrn <- sub(".+chr", "", blst$sseqid)
# blst$chrn[ blst$chrn == "X" ] <- 10
# blst$chrn <- as.numeric(blst$chrn)
#
# subt <- blst[grep("CsatSD_centromere_370bp", blst$qseqid), ]
# cent <- blst[grep("CsatSD_centromere_237bp", blst$qseqid), ]
#
# #blst
# #knitr::kable(blst[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
# knitr::kable(subt[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
# plot_ideo <- function() {
# suppressPackageStartupMessages(require(ggplot2))
#
# # marker_df <- data.frame(
# # chrom = rep(names(map), times = lapply(map, length)),
# # pos = unlist(map),
# # marker = names(unlist(map))
# # )
# # marker_df$chromf <- factor( marker_df$chrom, levels = names(map) )
# # marker_df$chromn <- as.numeric( marker_df$chromf )
#
# # chr_df <- data.frame(
# # start = unlist(lapply(map, min)),
# # end = unlist(lapply(map, max))
# # )
# # chr_df$chr <- names(map)
# # chr_df$chrf <- factor( chr_df$chr, levels = names(map))
# # chr_df$chrn <- as.numeric( chr_df$chrf )
#
# chr_df <- data.frame(
# start = 1,
# end = nucs$Length,
# chr = nucs$Id
# )
# chr_df$chrn <- sub(".+chr", "", nucs$Id)
# chr_df$chrn[chr_df$chrn == "X"] <- 10
# chr_df$chrn <- as.numeric(chr_df$chrn)
#
#
# chrom_wid <- 0.02
# p <- ggplot2::ggplot()
# p <- p + ggplot2::geom_rect( data = chr_df,
# ggplot2::aes( xmin = chrn - chrom_wid,
# xmax = chrn + chrom_wid,
# #xmin = as.numeric(as.factor(chr)) - chrom_wid,
# #xmax = as.numeric(as.factor(chr)) + chrom_wid,
# ymin = end, ymax = start),
# #fill = "#C0C0C0",
# fill = "#DCDCDC",
# #fill = "#F5F5F5",
# color = "#000000"
# )
# #p <- p + scale_y_reverse(limits = c(max_gd, 0))
#
# #wins$Id
# thinw <- 0.28
# p <- p + ggplot2::geom_rect(
# data = wins,
# ggplot2::aes(
# # xmin = chrn - CGs,
# # xmax = chrn + CGs,
# xmin = chrn - ATs * thinw,
# xmax = chrn + ATs * thinw,
# ymin = Start,
# ymax = End),
# fill = wins$gcol,
# #fill = "#0000CD",
# #fill = "#A9A9A9",
# #fill = "#C0C0C0",
# #fill = "#DCDCDC",
# #fill = "#F5F5F5",
# # color = "#000000"
# color = NA
# )
# #p
#
# cmwidth <- 0.4
# p <- p + ggplot2::geom_rect(
# data = cann_blst,
# ggplot2::aes(
# xmin = chrn - cmwidth,
# xmax = chrn + cmwidth,
# ymin = sstart,
# ymax = send),
# #fill = "#0000CD",
# #fill = "#A9A9A9",
# fill = "#C0C0C0",
# #fill = "#DCDCDC",
# #fill = "#F5F5F5",
# #color = "#000000"
# color = "#228B22"
# #color = NA
# )
# #p
#
#
# # p <- p + annotate(geom="text", x=7.5, y=11.3e6, label="THCAS1",
# # color="#228B22", size = 3)
# # p <- p + annotate(geom="text", x=7.5, y=12.1e6, label="CBDAS2",
# # color="#0000FF", size = 3)
# # p <- p + annotate(geom="text", x=7.5, y=30.7e6, label="CBCAS",
# # color="#B22222", bg = "#ffffff", size = 3)
# #p
#
# stwidth <- 0.40
# p <- p + ggplot2::geom_rect(
# data = subt,
# ggplot2::aes(
# xmin = chrn - stwidth,
# xmax = chrn + stwidth,
# #xmax = chrn,
# ymin = sstart,
# ymax = send),
# #fill = "#0000CD",
# #fill = "#A9A9A9",
# fill = "#C0C0C0",
# #fill = "#DCDCDC",
# #fill = "#F5F5F5",
# #color = "#000000"
# color = "#0000FF"
# #color = "#228B22"
# #color = NA
# )
# #p
#
# mwidth <- 0.6
# p <- p + ggplot2::geom_rect(
# data = cent,
# ggplot2::aes(
# xmin = chrn - mwidth,
# # xmax = chrn + mwidth,
# xmax = chrn - 0.2,
# ymin = sstart,
# ymax = send),
# #fill = "#0000CD",
# #fill = "#A9A9A9",
# fill = "#C0C0C0",
# #fill = "#DCDCDC",
# #fill = "#F5F5F5",
# color = "#000000"
# #color = "#0000FF"
# #color = "#8A2BE2"
# #color = "#228B22"
# #color = NA
# )
# #p
#
#
# # mwidth <- 0.4
# # p <- p + ggplot2::geom_rect(
# # data = Ty3,
# # ggplot2::aes(
# # xmin = chrn - 0,
# # # xmin = chrn - mwidth,
# # xmax = chrn + mwidth,
# # # xmax = chrn,
# # ymin = V4,
# # ymax = V5),
# # fill = "#0000FF",
# # color = NA
# # )
# # #p
#
#
# # marker_wid <- 0.1
# # marker_high <- 0.4
# # p <- p + ggplot2::geom_rect( data = marker_df,
# # ggplot2::aes( xmin = chromn - marker_wid,
# # xmax = chromn + marker_wid,
# # #xmin = as.numeric(as.factor(chrom)) - marker_wid,
# # #xmax = as.numeric(as.factor(chrom)) + marker_wid,
# # ymin = pos - marker_high, ymax = pos + marker_high),
# # fill = "#228B22", color = "#228B22"
# # )
# #
# # p <- p + scale_y_reverse( breaks = seq(0, 2e3, by = 100) )
# #p <- p + ggplot2::scale_y_reverse( minor_breaks = seq(0, 2e3, by = 20), breaks = seq(0, 2e3, by = 100) )
# #p <- p + scale_x_continuous( breaks = as.numeric(as.factor(chr_df$chr) ) )
# p <- p + ggplot2::scale_x_continuous( breaks = chr_df$chrn, labels = chr_df$chr )
# # p <- p + scale_y_reverse(limits = c(max_gd, 0))
# # p <- p + scale_y_reverse(limits = c(0, max_gd))
# #p <- p + theme_bw() + theme( panel.grid.minor = element_blank() )
# p <- p + ggplot2::theme_bw() +
# ggplot2::theme( panel.grid.minor.x = ggplot2::element_blank(),
# axis.text.x = element_text(angle = 60, hjust = 1),
# axis.title.x=element_blank(),
# panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
# panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
# #panel.grid.major.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 1 ),
# #panel.grid.minor.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 3 )
# )
# # p <- p + ggplot2::xlab("Chromosome")
#
# #p <- p + ylab("Distance (cM)")
# #p <- p + ggplot2::ylab("Location (cM)")
# p <- p + ggplot2::ylab("Position (bp)")
# #p
#
#
# #p
#
# #return( invisible( NULL ) )
# p
# }
# p1 <- plot_ideo()
# p1
#lndmrk[1:3, ]
#nrow(lndmrk)
#table(lndmrk$qseqid)
# pmildew <- "PNW39_LH3804_locus"
# lgrp <- system( 'grep "^>" ../csat_landmark_dna.fa', intern = TRUE )
# lgrp <- sub("^>", "", lgrp)
# #lgrp
#
# # Cann
# my_regex <- sub("[[:space:]].+", "", lgrp[1:6])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# Cent, subTelo
#lgrp[7:9]
# my_regex <- sub("[[:space:]].+", "", lgrp[7:9])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
# table(lndmrk2$qseqid)
#
# chrom_wid <- 0.46
# tmp <- data.frame(
# xmin = lndmrk2$chrn - chrom_wid,
# xmax = lndmrk2$chrn + chrom_wid,
# ymin = lndmrk2$sstart,
# ymax = lndmrk2$send
# )
#
# p1 + ggplot2::geom_rect(
# data = tmp,
# ggplot2::aes(
# xmin = xmin,
# xmax = xmax,
# ymin = ymin,
# ymax = ymax
# ),
# fill = "#DCDCDC",
# # color = "#CD853F",
# color = "#F08080",
# # color = "#000000"
# linewidth = 2
# ) + ggtitle("Sub-telo, centromeric motifs")
# P mildew
#lgrp[10:10]
# my_regex <- sub("[[:space:]].+", "", lgrp[10:10])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
#
# chrom_wid <- 0.46
# tmp <- data.frame(
# xmin = lndmrk2$chrn - chrom_wid,
# xmax = lndmrk2$chrn + chrom_wid,
# ymin = lndmrk2$sstart,
# ymax = lndmrk2$send
# )
#
# p1 + ggplot2::geom_rect(
# data = tmp,
# ggplot2::aes(
# xmin = xmin,
# xmax = xmax,
# ymin = ymin,
# ymax = ymax
# ),
# fill = "#DCDCDC",
# # color = "#CD853F",
# color = "#F08080",
# # color = "#000000"
# linewidth = 2
# ) + ggtitle("Powdery mildew")
# # CENP
# grep("NC_044371.1:c59909808-59908167 Cannabis sativa chromosome 1, cs10, whole genome shotgun sequence", lgrp):grep("NC_044371.1:c74285628-74284242 Cannabis sativa chromosome 1, cs10, whole genome shotgun sequence", lgrp)
# #lgrp[11:26]
# my_regex <- sub("[[:space:]].+", "", lgrp[11:26])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
#
# chrom_wid <- 0.46
# tmp <- data.frame(
# xmin = lndmrk2$chrn - chrom_wid,
# xmax = lndmrk2$chrn + chrom_wid,
# ymin = lndmrk2$sstart,
# ymax = lndmrk2$send
# )
#
# p1 + ggplot2::geom_rect(
# data = tmp,
# ggplot2::aes(
# xmin = xmin,
# xmax = xmax,
# ymin = ymin,
# ymax = ymax
# ),
# fill = "#DCDCDC",
# # color = "#CD853F",
# color = "#F08080",
# # color = "#000000"
# linewidth = 2
# ) + ggtitle("CENP")
# # rRNA
# grep("NC_044375.1:87745002-87745120 Cannabis sativa chromosome 2, cs10, whole genome shotgun sequence", lgrp):grep("NC_026562.1:100368-101858 Cannabis sativa cultivar Carmagnola chloroplast, complete genome", lgrp)
# #lgrp[27:55]
# my_regex <- sub("[[:space:]].+", "", lgrp[27:55])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
#
# lndmrk2 <- lndmrk2[ lndmrk2$gaps == 0, ]
# lndmrk2 <- lndmrk2[ lndmrk2$pident >= 95, ]
#
# #lndmrk2 <- lndmrk2[grep("^NC_044", lndmrk2$qseqid), ]
#
# tbl2 <- table(lndmrk2$chrn, lndmrk2$qseqid)
# tbl2[, colSums(as.matrix(tbl2)) > 1e3]
# #tbl2['8',]
#
# #tbl2[, grep("^NW", colnames(tbl2))]
#
# table(lndmrk2$chrn)
#
# # OAS
# #lgrp[56:57]
# #my_regex <- sub("[[:space:]].+", "", lgrp[56:57])
# #my_regex <- paste(my_regex, collapse="|")
# #lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
#
#
# #tmp <- data.frame( xmin = 1:4, xmax = 2:5, ymin = rep(10, times = 4), ymax = rep(10000000, times = 4))
#
# chrom_wid <- 0.46
# tmp <- data.frame(
# xmin = lndmrk2$chrn - chrom_wid,
# xmax = lndmrk2$chrn + chrom_wid,
# ymin = lndmrk2$sstart,
# ymax = lndmrk2$send
# )
#
# p1 + ggplot2::geom_rect(
# data = tmp,
# ggplot2::aes(
# xmin = xmin,
# xmax = xmax,
# ymin = ymin,
# ymax = ymax
# ),
# #fill = "#C0C0C0",
# fill = "#DCDCDC",
# #fill = "#F5F5F5",
# # color = "#FFD700"
# # color = "#DAA520",
# color = "#CD853F",
# # color = "#000000"
# linewidth = 2
# ) + ggtitle("rRNA")
#
# #ggsave(filename = "EH23a_ideo.png", device = "png", width = 6.5, height = 6.5, units = "in", dpi = 300)
#
# # OAS
# #lgrp[56:57]
# my_regex <- sub("[[:space:]].+", "", lgrp[56:57])
# my_regex <- paste(my_regex, collapse="|")
# lndmrk2 <- lndmrk[grep(my_regex, lndmrk$qseqid), ]
#
# chrom_wid <- 0.46
# tmp <- data.frame(
# xmin = lndmrk2$chrn - chrom_wid,
# xmax = lndmrk2$chrn + chrom_wid,
# ymin = lndmrk2$sstart,
# ymax = lndmrk2$send
# )
#
# p1 + ggplot2::geom_rect(
# data = tmp,
# ggplot2::aes(
# xmin = xmin,
# xmax = xmax,
# ymin = ymin,
# ymax = ymax
# ),
# fill = "#DCDCDC",
# # color = "#CD853F",
# color = "#F08080",
# # color = "#000000"
# linewidth = 2
# ) + ggtitle("OAS")
my_data <- readr::read_csv("gene_counts.csv", )
## Rows: 69 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Sample
## dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX, chrY
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# names(my_data)[1] <- "Sample"
# my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
## Sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH3Ma 4052 2763 2586 3189 2424 2535 2058 2892 2610 3689 NA
## 2 AH3Mb 4062 2807 2620 3224 2563 2604 2080 2967 2509 NA 2850
## 3 BCMa 4137 2878 2682 3256 2597 2599 2031 2953 2654 3792 NA
## 4 BCMb 4059 2703 2611 3098 2601 2615 1986 2905 2569 NA 2751
## 5 BOAXa 4097 2877 2680 3147 2444 2718 2015 2950 2612 3778 NA
## 6 BOAXb 4039 2816 2507 3159 2499 2586 1995 2914 2551 3670 NA
## 7 COFBa 4365 2681 2552 3035 2404 2457 1793 2840 2483 3625 NA
## 8 COFBb 4078 2960 2755 3300 2768 2750 2212 3058 2769 3944 NA
## 9 COSVa 4063 2834 2677 3130 2595 2580 2048 2940 2594 3722 NA
## 10 COSVb 4049 2846 2690 3175 2601 2645 2029 2977 2566 3801 NA
## # ℹ 59 more rows
library(tidyr)
data_long <- my_data %>%
pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Count")
#data_long$Length <- data_long$Length / 1e6
gcount <- data_long
sum(is.na(data_long$Count))
## [1] 69
data_long <- data_long[!is.na(data_long$Count), ]
data_long
## # A tibble: 690 × 3
## Sample Chrom Count
## <chr> <chr> <dbl>
## 1 AH3Ma chr1 4052
## 2 AH3Ma chr2 2763
## 3 AH3Ma chr3 2586
## 4 AH3Ma chr4 3189
## 5 AH3Ma chr5 2424
## 6 AH3Ma chr6 2535
## 7 AH3Ma chr7 2058
## 8 AH3Ma chr8 2892
## 9 AH3Ma chr9 2610
## 10 AH3Ma chrX 3689
## # ℹ 680 more rows
my_data <- readr::read_csv("chrom_lengths.csv")
## New names:
## Rows: 69 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX,
## chrY
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
names(my_data)[1] <- "Sample"
my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
## Sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH3Ma 6.57e7 7.65e7 8.20e7 8.16e7 7.65e7 7.63e7 6.64e7 5.41e7 6.60e7 8.42e7
## 2 AH3Mb 6.63e7 7.62e7 8.11e7 8.09e7 7.80e7 7.64e7 6.65e7 5.52e7 6.50e7 NA
## 3 BCMa 6.51e7 7.58e7 8.62e7 8.12e7 7.67e7 7.53e7 6.55e7 5.48e7 6.49e7 8.33e7
## 4 BCMb 6.32e7 7.83e7 8.16e7 7.84e7 7.69e7 7.68e7 6.44e7 5.42e7 6.55e7 NA
## 5 BOAXa 6.53e7 7.96e7 8.19e7 7.80e7 7.64e7 7.91e7 6.40e7 5.48e7 6.53e7 8.32e7
## 6 BOAXb 6.50e7 7.76e7 7.95e7 7.97e7 7.57e7 7.58e7 6.49e7 5.44e7 6.60e7 8.26e7
## 7 COFBa 7.21e7 7.85e7 8.44e7 8.21e7 7.76e7 7.97e7 6.37e7 5.57e7 6.55e7 8.57e7
## 8 COFBb 6.47e7 7.71e7 8.21e7 8.16e7 7.77e7 7.71e7 6.84e7 5.52e7 6.63e7 8.46e7
## 9 COSVa 6.76e7 8.34e7 8.63e7 8.52e7 8.09e7 7.93e7 6.99e7 5.53e7 6.81e7 8.65e7
## 10 COSVb 6.61e7 8.15e7 8.74e7 8.40e7 8.46e7 8.30e7 6.50e7 5.75e7 6.78e7 8.58e7
## # ℹ 59 more rows
## # ℹ 1 more variable: chrY <dbl>
library(tidyr)
data_long <- my_data %>%
pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Length")
data_long$Length <- data_long$Length / 1e6
data_long
## # A tibble: 759 × 3
## Sample Chrom Length
## <fct> <chr> <dbl>
## 1 AH3Ma chr1 65.7
## 2 AH3Ma chr2 76.5
## 3 AH3Ma chr3 82.0
## 4 AH3Ma chr4 81.6
## 5 AH3Ma chr5 76.5
## 6 AH3Ma chr6 76.3
## 7 AH3Ma chr7 66.4
## 8 AH3Ma chr8 54.1
## 9 AH3Ma chr9 66.0
## 10 AH3Ma chrX 84.2
## # ℹ 749 more rows
glength <- data_long
glength
## # A tibble: 759 × 3
## Sample Chrom Length
## <fct> <chr> <dbl>
## 1 AH3Ma chr1 65.7
## 2 AH3Ma chr2 76.5
## 3 AH3Ma chr3 82.0
## 4 AH3Ma chr4 81.6
## 5 AH3Ma chr5 76.5
## 6 AH3Ma chr6 76.3
## 7 AH3Ma chr7 66.4
## 8 AH3Ma chr8 54.1
## 9 AH3Ma chr9 66.0
## 10 AH3Ma chrX 84.2
## # ℹ 749 more rows
gcount
## # A tibble: 759 × 3
## Sample Chrom Count
## <chr> <chr> <dbl>
## 1 AH3Ma chr1 4052
## 2 AH3Ma chr2 2763
## 3 AH3Ma chr3 2586
## 4 AH3Ma chr4 3189
## 5 AH3Ma chr5 2424
## 6 AH3Ma chr6 2535
## 7 AH3Ma chr7 2058
## 8 AH3Ma chr8 2892
## 9 AH3Ma chr9 2610
## 10 AH3Ma chrX 3689
## # ℹ 749 more rows
data_long <- cbind(glength, gcount$Count)
names(data_long)[4] <- "Count"
data_long[1:3, ]
## Sample Chrom Length Count
## 1 AH3Ma chr1 65.67137 4052
## 2 AH3Ma chr2 76.48150 2763
## 3 AH3Ma chr3 81.99513 2586
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")
library(ggplot2)
# Basic scatter plot
p <- ggplot(data_long, aes(x=Length, y=Count, color=Chrom))
p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)
p <- p + theme_bw()
p <- p + theme(legend.title = element_blank())
#p <- p + theme(legend.position = "left")
p <- p + theme(legend.position = "right")
#p <- p + theme( legend.spacing.y = unit(1.0, 'mm') )
## important additional element
#p <- p + guides(fill = guide_legend(byrow = TRUE))
p + theme(legend.spacing.y = unit(1.0, 'mm')) +
guides(fill = guide_legend(byrow = TRUE))
## `geom_smooth()` using formula = 'y ~ x'
#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Chromosome length (Mbp)")
p <- p + ylab("Genes per chromosome")
p2 <- p
p2
## `geom_smooth()` using formula = 'y ~ x'
wins <- read.csv("EH23a.softmasked_wins1e6.csv")
gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#gff[1:3, 1:8]
genes <- gff[ gff[, 3] == "gene", ]
gff <- read.table("../Figure1/EH23a.unmasked.fasta.mod.EDTA.intact.gff3", sep = "\t", quote = "\"")
names(gff) <- c("seqid","source","type","start","end","score","strand","phase","attributes")
#gff$chrn <- sub("^.+chr", "", gff$V1)
gff$chrn <- sub("^.+chr", "", gff$seqid)
gff$chrn[ gff$chrn == "X" ] <- 10
gff$chrn <- as.numeric(gff$chrn)
gff$classification <- unlist(lapply(strsplit(gff[,9], split = ";"), function(x){ grep("Classification=", x, value = TRUE) }))
gff$classification <- sub("^Classification=", "", gff$classification)
gff$classification <- sub("^LTR/Copia", "Ty1", gff$classification)
gff$classification <- sub("^LTR/Gypsy", "Ty3", gff$classification)
#
gff[1:8, c(1:8, 10)]
## seqid source type start end score strand phase
## 1 EH23a.chr1 EDTA repeat_region 48463 58586 . - .
## 2 EH23a.chr1 EDTA target_site_duplication 48463 48467 . - .
## 3 EH23a.chr1 EDTA long_terminal_repeat 48468 50359 . - .
## 4 EH23a.chr1 EDTA Copia_LTR_retrotransposon 48468 58581 . - .
## 5 EH23a.chr1 EDTA long_terminal_repeat 56690 58581 . - .
## 6 EH23a.chr1 EDTA target_site_duplication 58582 58586 . - .
## 7 EH23a.chr1 EDTA repeat_region 67125 71966 . + .
## 8 EH23a.chr1 EDTA target_site_duplication 67125 67129 . + .
## chrn
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
## 7 1
## 8 1
# Ty3
Ty3 <- gff[ gff$classification == "Ty3", ]
Ty3 <- Ty3[ Ty3$type == "Gypsy_LTR_retrotransposon", ]
#Ty3 <- Ty3[ Ty3$V3 == "Gypsy_LTR_retrotransposon", ]
Ty3[1:8, c(1:8, 10:11)]
## seqid source type start end score strand
## 81 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2110985 2122540 . ?
## 87 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2180122 2191426 . +
## 161 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3614107 3626646 . -
## 206 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3905221 3916714 . ?
## 228 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4168059 4180534 . -
## 242 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4447228 4458755 . -
## 283 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4885633 4891899 . -
## 305 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 5232888 5239051 . +
## phase chrn classification
## 81 . 1 Ty3
## 87 . 1 Ty3
## 161 . 1 Ty3
## 206 . 1 Ty3
## 228 . 1 Ty3
## 242 . 1 Ty3
## 283 . 1 Ty3
## 305 . 1 Ty3
wins$gcnt <- 0
#wins$ty1cnt <- 0
wins$ty3cnt <- 0
for(i in 1:nrow(wins)){
tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
wins$gcnt[i] <- nrow(tmp)
#tmp <- gff[gff$V1 == wins$Id[i] & gff$V4 >= wins$Start[i] & gff$V5 < wins$End[i], ]
tmp <- Ty3[Ty3$seqid == wins$Id[i] & Ty3$start >= wins$Start[i] & Ty3$end < wins$End[i], ]
wins$ty3cnt[i] <- nrow(tmp)
# wins$ty1cnt[i] <- sum(tmp$classification == "Ty1")
}
wins[1:3, ]
## Id Win_number Start End Win_length A C G T
## 1 EH23a.chr1 0 0 999999 1000000 339510 159349 157789 343352
## 2 EH23a.chr1 1 1000000 1999999 1000000 346184 152100 152139 349577
## 3 EH23a.chr1 2 2000000 2999999 1000000 343278 155236 154692 346794
## CG CHG CHH gcnt ty3cnt
## 1 14003 19695 90544 156 0
## 2 13018 18441 88693 141 0
## 3 14100 19606 89229 123 2
hist(wins$ty3cnt)
#plot(wins$gcnt, wins$ty3cnt)
#plot(wins$gcnt, wins$ty1cnt)
# wins[1:3, ]
# table(wins$Id)
# my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
# #my_pal[11] <- "#B15928"
# my_pal[11] <- "#C71585"
# #my_pal <- paste(my_pal, "66", sep = "")
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")
lm1 <- lm( wins$ty3cnt ~ wins$gcnt )
summary(lm1)
##
## Call:
## lm(formula = wins$ty3cnt ~ wins$gcnt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4540 -1.9798 0.0355 1.7483 10.8191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.453983 0.182058 51.93 <2e-16 ***
## wins$gcnt -0.067007 0.003699 -18.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.905 on 742 degrees of freedom
## Multiple R-squared: 0.3066, Adjusted R-squared: 0.3057
## F-statistic: 328.1 on 1 and 742 DF, p-value: < 2.2e-16
my_coefs <- round(lm1$coefficients, digits = 3)
#wins[1:3, ]
library(ggplot2)
# Basic scatter plot
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=Id))
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=chr))
p <- ggplot( data = wins, aes(x=gcnt, y=ty3cnt, color = Id))
#p <- p + geom_point(size=2, color = "#C0C0C0")
#p <- p + geom_point(size=2, color = "#77889966")
# p <- p + geom_point(size=2, color = "#70809044")
#
p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
#p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 2)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)
#p <- p + geom_text(x=100, y=90, label="y = (-0.4)x + 56.7", size = 4)
p <- p + geom_text(x=100, y=15, label= paste("y = ", my_coefs[2], "x + ", my_coefs[1], sep = ""), color = "#000000", size = 4, parse = F)
p <- p + theme_bw()
p <- p + theme(legend.position = "none")
#p <- p + theme(legend.title = element_blank())
#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
# p <- p + xlab("Genes per 1 Mbp window")
# p <- p + ylab("Ty3 elements per 1 Mbp window")
p <- p + xlab("Genes per window")
p <- p + ylab("Ty3 elements window")
# p
p3 <- p
p3
## `geom_smooth()` using formula = 'y ~ x'
# lm2 <- lm( wins$ty1cnt ~ wins$gcnt )
# summary(lm2)
source("../FigureSideo/Rfunctions.R")
p1 <- plot_ideo("/media/knausb/E737-9B48/releases/scaffolded//EH23a")
#library(magick)
#ggdraw() +
# cowplot::draw_image("https://i.stack.imgur.com/WDOo4.jpg?s=328&g=1") #+
# draw_plot(my_plot)
library("ggpubr")
ggarrange(
plotlist = list(p1,
ggarrange(p3, p2,
ncol = 2, nrow = 1,
widths = c(2, 3),
labels = c("B", "C"))
),
labels = c("A", ""),
ncol = 1, nrow = 2,
widths = 1, heights = c(2, 1))
Figure 1. Genomic architecture of the Cannabis sativa genome.
Figure 1. Genomic architecture of the Cannabis sativa genome. (A) Ideogram of the ERBxHO40_23 haplotype A genome. Each chromosome is divided into 1 Mbp windows where each window’s width is proportional to the inverse abundance of the motif ‘CG’ and each window is colored according to gene density. Windows with high ‘CG’ abundance are narrow and windows with high gene counts are yellow. The plant ERBxHO40_23 resulted from a cross between strains ERB (chemotype III or CBD dominant) and HO40 (chemotype I or THC dominant). The genetically female plant HO40 (XX) was chemically masculinized to produce pollen flowers to facilitate the cross. Subtelomeric repeat motifs are marked with blue horizontal lines. Cannabinoid synthase genes are marked with green horizontal lines on chromosome seven. (B) The abundances of genes and Ty3 LTRs are inversely related. Each dot represents a 1 Mbp window from the ERBxHO40_23 haplotype A assembly, organized by the number of genes and Ty3 LTR elements contained in each window. Windows with high quantities of genes have low Ty3 LTR counts. The genes in ERBxHO40 are predominantly near the ends of each chromosome, while the central portion of each chromosome appears populated by Ty3 LTR elements. (C) Long chromosomes have more genes, however some chromosomes demonstrate exceptional gene content. Chromosome Y is the longest chromosome but includes a number of genes that is comparable to other chromosomes. Each dot represents a chromosome from phased and assembled haplotypes (n = 69; X = 65; Y = 4). Chromosome 1 is of average length but contains more genes than other chromosomes of a similar length. Chromosomes X and 8 also have more genes than chromosomes of similar length. Chromosome 7, where the cannabinoid synthases (CBCAS, CBDAS, THCAS) are found, has the lowest number of genes.
afreq <- read.csv("/media/knausb/Vining_lab/knausb/GENOMES_NRQ_2267/ERBxHO40_hapERB/het_wins/allele_freqs.csv")
#hist(afreq$countNA)
#afreq <- afreq[afreq$countNA <= 2, ]
#
afreq <- afreq[afreq$countNA <= 0, ]
afreq[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1
## 13 chr_1e 62350 270 509,31 0.1082236 1.121357 240 29
## 30 chr_1e 98473 270 412,128 0.3617010 1.566664 145 122
## 54 chr_1e 204226 270 263,277 0.4996639 1.998657 65 133
## count1.1 countNA ERBcount HO40count ERBfreq HO40freq He2 POS2
## 13 1 0 509 31 0.9425926 0.05740741 0.1074074 62350
## 30 3 0 412 128 0.7629630 0.23703704 0.4518519 98473
## 54 72 0 263 277 0.4870370 0.51296296 0.4925926 204226
EH23a <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
EH23a$Frequency <- EH23a$ERBfreq
EH23a$facet1 <- "EH23a"
EH23b <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
EH23b$Frequency <- EH23a$HO40freq
EH23b$facet1 <- "EH23b"
Het <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0",
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq",
"HO40freq", "He2", "POS2")]
Het$Frequency <- EH23a$He2
Het$facet1 <- "Het"
afreq <- rbind(EH23a, EH23b, Het)
afreq$facet2 <- afreq$facet1
afreq$facet2 <- sub("EH23[ab]", "EH23", afreq$facet2)
afreq$facet2[ afreq$facet2 == "EH23" ] <- "Allele"
afreq$facet2[ afreq$facet2 == "Het" ] <- "Heterozygosity"
afreq[1:3, ]
## CHROM POS n Allele_counts He Ne count0.0 count0.1
## 13 chr_1e 62350 270 509,31 0.1082236 1.121357 240 29
## 30 chr_1e 98473 270 412,128 0.3617010 1.566664 145 122
## 54 chr_1e 204226 270 263,277 0.4996639 1.998657 65 133
## count1.1 countNA ERBcount HO40count ERBfreq HO40freq He2 POS2
## 13 1 0 509 31 0.9425926 0.05740741 0.1074074 62350
## 30 3 0 412 128 0.7629630 0.23703704 0.4518519 98473
## 54 72 0 263 277 0.4870370 0.51296296 0.4925926 204226
## Frequency facet1 facet2
## 13 0.9425926 EH23a Allele
## 30 0.7629630 EH23a Allele
## 54 0.4870370 EH23a Allele
library(ggplot2)
#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.01)
my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)
#as.numeric(as.factor(afreq$CHROM))
#afreq$col2 <- viridisLite::magma( n = 10, alpha = 0.8, begin = 0, end = 1 )[as.numeric(as.factor(afreq$CHROM))]
p <- ggplot( data = afreq,
mapping = aes( x = POS2,
#y = ERBfreq,
#y = freq,
y = Frequency,
color = CHROM )
#color = col2)
)
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + geom_hline( yintercept = 0.5, linetype = 1, color = "#000000", linewidth = 1 )
#p + scale_color_manual( values = viridisLite::magma( n = 10, alpha = 0.01, begin = 0.2, end = 0.9 ) )
p <- p + ylim(0.3, 0.7)
p <- p + scale_color_manual( values = my_pal )
#
#p <-
p + facet_grid(facet1 ~ .)
#p <-
ahplot <- p + facet_grid(facet2 ~ .)
#p
ahplot
#p + scale_color_manual( values = afreq$col2 )
#+ scale_color_viridis_b()
library("ggpubr")
ggarrange(
plotlist = list(p1, ahplot),
labels = c("A", "B"),
ncol = 1, nrow = 2,
widths = 1, heights = c(2, 1))
Figure 1. Genomic architecture of the Cannabis sativa genome.